Project Description and Summary (TODO: add the GOAL, APPROACH, and CONCLUSION; this section must fit on one page)

The goal of this project is to find the most accurate model using a variety of statistical model building techniques in order to predict medical outcomes relating to breast cancer. We will use the BRCA Multi-Omics (TCGA) data from Kaggle, and predict outcomes for the PR.Status, ER.Status, HER2.Final.Status, and histological.type. For our approach, we will first perform the necessary data cleaning operations, then [method 1] and [method 2] to build classification models for PR.Status and [method 1] and [method 2] for histological.type, using classification error and AUC respectively as the evaluation criteria. Next, we will use [method] to build a model with the goal of accurately predicting all four outcomes. Finally, [INSERT CONCLUSION HERE].

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the pre-cleaned BRCA multi-omics data directly from the project repo.
brca_data = read.csv("https://raw.githubusercontent.com/mgarbvs/STAT-432-final-project/main/cleaned_brca_data.csv")
# remove X.1 and X columns (leftover row-index columns written by earlier CSV exports)
brca_data = brca_data %>% select(-c("X.1", "X"))
# quick visual sanity check on the first ten rows
head(brca_data, 10)

Structure of data:

#str(brca_data)

PR.Status, ER.Status, and HER2.Final.Status are determined using immunohistochemistry scoring. For these variables, we will only consider two levels: “Positive” and “Negative”. For histological.type, we will only consider “infiltrating lobular carcinoma” and “infiltrating ductal carcinoma”. You can treat all other categories as missing values. Hence, all four outcomes should be binary.

# Locate the four outcome columns; after dropping X.1 and X they sit at
# the very end of the data frame (indices 1937-1940).
which(colnames(brca_data)=="PR.Status")
## [1] 1937
which(colnames(brca_data)=="ER.Status")
## [1] 1938
which(colnames(brca_data)=="HER2.Final.Status")
## [1] 1939
which(colnames(brca_data)=="histological.type")
## [1] 1940

Check for null values:

# Count missing values in the four outcome columns (1937-1940); the cleaned
# CSV should already have handled them, so all counts are expected to be 0.
colSums(is.na(brca_data[, 1937:1940]))
##         PR.Status         ER.Status HER2.Final.Status histological.type 
##                 0                 0                 0                 0

Next, check for highly correlated variables and keep only one from each correlated group.

suppressMessages(library(caret))
suppressMessages(library(dplyr))
# store the four OUTCOME variables; note that despite its name, `predictors`
# holds the responses (PR.Status, ER.Status, HER2.Final.Status,
# histological.type), not the predictors
predictors = brca_data %>% select(c("PR.Status", "ER.Status", "HER2.Final.Status", "histological.type"))
# remove the categorical outcome columns temporarily so that only the
# predictor columns remain in brca_data
brca_data = brca_data %>% select(-c("PR.Status", "ER.Status", "HER2.Final.Status", "histological.type"))


# use correlation matrix (earlier attempt with cutoff .8, kept for reference)
#cor_mat = cor(brca_data)
# returns vector of indices to remove
#cor_list = findCorrelation(cor_mat, cutoff = .8)

Drop the highly correlated columns

#brca_data =  brca_data[, -c(cor_list)]
#brca_data

We still have 1,087 columns left after removing the highly correlated ones.

# categorical variables live in columns 605-1464 and 1465-1713; they are
# excluded from the correlation screen below.  (The original bare
# expressions `brca_data[605:1464]` and `brca_data[1465:1713]` printed both
# ranges in full and discarded the result — removed.)
# only use variances using the corr mat
non_categorical_brca = cbind(brca_data[1:604], brca_data[1714:1936])

# use correlation matrix over the continuous columns only
cor_mat = cor(non_categorical_brca)

# returns vector of column indices to remove (pairwise |cor| > 0.7)
cor_list = findCorrelation(cor_mat, cutoff = .7)
non_categorical_brca =  non_categorical_brca[, -c(cor_list)]
# preview a few rows instead of dumping the full (still very wide) data frame
head(non_categorical_brca)

Modeling with KNN (predict PR.Status)

Split the data:

## hold out 25% of the rows for testing; train on the remaining 75%
n_obs <- nrow(non_categorical_brca)
smp_size <- floor(0.75 * n_obs)

## fix the RNG state so the train/test partition is reproducible
set.seed(123)
train_ind <- sample(seq_len(n_obs), size = smp_size)

## matrix form is needed later by knn() and glmnet()
brca_train <- as.matrix(non_categorical_brca[train_ind, ])
brca_test <- as.matrix(non_categorical_brca[-train_ind, ])


## training labels for PR.Status, plus a combined design matrix
## (brca_train_1 is used by the commented-out CV tuning chunk below)
train_y <- as.matrix(predictors$PR.Status[train_ind])
brca_train_1 <- as.matrix(cbind(brca_train, train_y))

``` r
# library(kknn)
#
# control <- trainControl(method = "cv", number = 10)
# knn.cvfit <- train(y ~ ., method = "knn",
#                    data = data.frame("x" = brca_train_1, "y" = as.factor(predictors$PR.Status[train_ind])),
#                    tuneGrid = data.frame(k = seq(1, 40, 1)),
#                    trControl = control)
```

``` r
# plot(knn.cvfit$results$k, 1-knn.cvfit$results$Accuracy,
#      xlab = "K", ylab = "Classification Error", type = "b",
#      pch = 19, col = "darkorange")
```

``` r
# which.min(1-knn.cvfit$results$Accuracy)
```

The best k is 10

library(caret)
library(class)
# training labels for PR.Status (train_y is also reused by the lasso fit below)
train_y = as.matrix(predictors$PR.Status[train_ind])

# class::knn() fits and predicts in a single call, using the best k = 10
# found by the (commented-out) cross-validation above.  The original line
# assigned the identical result to both `pred` and `knn.fit`, and built a
# duplicate copy of the labels (`y`); both redundancies are removed.
pred = knn(train = brca_train, test = brca_test, cl = train_y, k = 10)

# confusion matrix: rows = predictions, columns = true held-out labels
cmat = table(pred, as.factor(predictors$PR.Status[-train_ind]))

cmat
##           
## pred       Negative Positive
##   Negative       23        3
##   Positive       20       81

Accuracy:

# overall accuracy computed from the confusion matrix itself, rather than
# hard-coding the diagonal cell counts (81 + 23), which silently go stale
# whenever the data or the split changes
sum(diag(cmat)) / sum(cmat)
## [1] 0.8188976

Modeling with Lasso (predict PR.Status)

suppressMessages(library(glmnet))
# fix the seed so the CV fold assignment (and hence lambda selection) is reproducible
set.seed(1)

# 10-fold cross-validated lasso (alpha = 1) logistic regression for
# PR.Status, tuned on misclassification error (type.measure = "class")
lasso.fit = cv.glmnet(brca_train, train_y, nfolds = 10, alpha = 1, type.measure = "class", family = "binomial")

# left: CV error vs lambda; right: coefficient paths along the lambda sequence
par(mfrow = c(1, 2))
    plot(lasso.fit)
    plot(lasso.fit$glmnet.fit, "lambda")

``` r
# coef(lasso.fit, s = "lambda.min")
```

``` r
# coef(lasso.fit, s = "lambda.1se")
```

Predict on the test data:

# predict PR.Status classes on the test set at the CV-optimal lambda
best_lambda = lasso.fit$lambda.min
test_predict = predict(lasso.fit, brca_test, s = best_lambda, type = "class")
# test_predict
# length(test_predict)
# length(predictors$PR.Status[-train_ind])
# BUG FIX: this model was trained on PR.Status, but the confusion matrix
# was originally tabulated against histological.type; compare against the
# held-out PR.Status labels instead.
cmat_1 = table(predictors$PR.Status[-train_ind], test_predict)
cmat_1
##                                 test_predict
##                                  Negative Positive
##   infiltrating ductal carcinoma        29       85
##   infiltrating lobular carcinoma        1       12

Accuracy:

# accuracy from the confusion matrix itself; the original hard-coded
# (25 + 79), values that do not even appear in the printed cmat_1
sum(diag(cmat_1)) / sum(cmat_1)
## [1] 0.8188976

The classification error is 0.1811024

Modelling histological.type with Logistic (with Ridge and Lasso penalty)

Next, let us model histological.type, using AUC as the evaluation criterion.

First, use Penalized Logistic Regression:

library(glmnet)
require(methods)

# training labels for histological.type
train_y_hist= as.matrix(predictors$histological.type[train_ind])
# NOTE(review): cbind() with a character column coerces the whole matrix to
# character; data.matrix() below converts the predictor columns back to
# numeric before fitting.
brca_train_2 = as.matrix(cbind(brca_train, train_y_hist))

# the response is always the LAST column; use ncol() instead of the magic
# index 640 so the code survives upstream changes to the predictor set
resp_col = ncol(brca_train_2)

# need to force matrix as dgCMatrix for some reason??: https://stackoverflow.com/questions/8458233/r-glmnet-as-matrix-error-message

# using the train data and 10-fold CV (spell out `nfolds`; the original
# `nfold` relied on partial argument matching)
logistic.fit = cv.glmnet(x = as(data.matrix(brca_train_2[, -resp_col]), "dgCMatrix"), y = brca_train_2[, resp_col], nfolds = 10, family = "binomial")

plot(logistic.fit)

Next, use this model to predict on the test data:

library(ROCR)
# predicted probabilities (type = "response") for the held-out rows, at
# cv.glmnet's default lambda choice
test_predict <- predict(logistic.fit, brca_test, type="response")


# pair the predicted probabilities with the true histological.type labels
roc <- prediction(test_predict, predictors$histological.type[-train_ind])

# calculates the ROC curve (true-positive rate vs false-positive rate)
perf <- performance(roc,"tpr","fpr")
plot(perf,colorize=TRUE)

The AUC:

# AUC for the lasso-penalized logistic model on the held-out data
performance(roc, measure = "auc")@y.values[[1]]
## [1] 0.9473684

Next, try with Ridge:

library(glmnet)
set.seed(432)
# ridge penalty (alpha = 0) on the same design matrix; reference the
# response column by position (last column) rather than the hard-coded
# magic index 640
ridge.fit = cv.glmnet(as(data.matrix(brca_train_2[, -ncol(brca_train_2)]), "dgCMatrix"), y = brca_train_2[, ncol(brca_train_2)], nfolds = 10, alpha = 0, family = "binomial")
# coefficient paths along the lambda sequence
plot(ridge.fit$glmnet.fit, "lambda")

Next, use this model to predict on the test data:

library(ROCR)
# predicted probabilities for the held-out rows from the ridge model
test_predict <- predict(ridge.fit, brca_test, type="response")

# pair the predicted probabilities with the true histological.type labels
roc <- prediction(test_predict, predictors$histological.type[-train_ind])

# calculates the ROC curve (true-positive rate vs false-positive rate)
perf <- performance(roc,"tpr","fpr")
plot(perf,colorize=TRUE)

The AUC:

# AUC for the ridge-penalized logistic model on the held-out data
performance(roc, measure = "auc")@y.values[[1]]
## [1] 0.8407557

The AUC for logistic regression with the ridge penalty is lower than that with the lasso penalty.

Modelling histological.type with Kernel

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
set.seed(432)
# response column referenced by position (last column) instead of the
# magic index 640.
# NOTE(review): ntree = 4 is far too few trees for a stable forest; kept
# as-is to preserve the reported results, but consider ntree >= 500.
rf.fit = randomForest(data.matrix(brca_train_2[, -ncol(brca_train_2)]), y = as.factor(brca_train_2[, ncol(brca_train_2)]), ntree = 4, mtry = 4, nodesize = 10, sampsize = 50)

Next, use this model to predict on the test data:

library(ROCR)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# predicted probability of the second class level for each test row
test.predictions <- predict(rf.fit, type="prob", newdata=brca_test)[,2]

# true held-out histological.type labels
correct_hist = as.factor(predictors$histological.type[-train_ind])
rf_pr_test <- prediction(test.predictions, correct_hist)
# ROC curve (true-positive rate vs false-positive rate) for the random forest
r_auc_test <- performance(rf_pr_test, "tpr","fpr")
plot(r_auc_test, colorize=TRUE)

The AUC:

# AUC for the random forest model on the held-out data
performance(rf_pr_test, measure = "auc")@y.values[[1]] 
## [1] 0.7672065

The random forest AUC (≈ 0.767) is noticeably worse than both penalized logistic regression models, so this model is not competitive as currently configured (ntree = 4 is likely far too small).